package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.HtmlCleaner;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.NumberUtils;
import com.kdtech.utils.StringUtils;
import com.kdtech.utils.HtmlCleaner;

/**
 * Parser for news articles published on iyaxin.com (Xinjiang Net).
 *
 * <p>{@link #isDetailPage(String)} decides whether a URL looks like an article
 * page, and {@link #parserHtml(UrlMeta)} extracts title, publication date,
 * body and source from the downloaded HTML using per-sub-site CSS selectors
 * with a chain of fallbacks for older page layouts.
 *
 * @author Chase
 *
 */
public class IyaxinNewsAnalyse implements AnalyseNews {

	/** Dated article path used by most sub-sites: {@code yyyy-MM/dd/content_<id>.htm}. */
	private static final String DATED_ARTICLE = "[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*\\.htm";

	/** URLs under this prefix are never treated as article pages, even if a pattern matches. */
	private static final String PHOTO_HOST_PREFIX = "http://photo.iyaxin.com";

	/**
	 * Article URL patterns, compiled once. Literal dots are escaped and the
	 * catch-all host pattern is restricted to a single host component
	 * ({@code [^/]+}) so that a URL on a foreign host cannot match merely by
	 * containing ".iyaxin.com" somewhere in its path.
	 */
	private static final java.util.regex.Pattern[] DETAIL_PAGE_PATTERNS = {
			java.util.regex.Pattern.compile("http://news\\.iyaxin\\.com/content/" + DATED_ARTICLE),
			java.util.regex.Pattern.compile("http://ent\\.iyaxin\\.com/content/" + DATED_ARTICLE),
			java.util.regex.Pattern.compile("http://law\\.iyaxin\\.com/content/" + DATED_ARTICLE),
			java.util.regex.Pattern.compile("http://money\\.iyaxin\\.com/" + DATED_ARTICLE),
			java.util.regex.Pattern.compile("http://money\\.iyaxin\\.com/index\\.php[?]a=show&c=index&catid=[0-9]+&id=[0-9]+&m=content"),
			java.util.regex.Pattern.compile("http://www\\.iyaxin\\.com/content/" + DATED_ARTICLE),
			java.util.regex.Pattern.compile("http://travel\\.iyaxin\\.com/content/" + DATED_ARTICLE),
			// Multi-page articles: content_<id>_<page>.htm
			java.util.regex.Pattern.compile("http://news\\.iyaxin\\.com/content/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*_[0-9]*\\.htm"),
			java.util.regex.Pattern.compile("http://pinglun\\.iyaxin\\.com/content/" + DATED_ARTICLE),
			// Any other sub-domain using the dated article path.
			java.util.regex.Pattern.compile("http://[^/]+\\.iyaxin\\.com/content/" + DATED_ARTICLE),
			java.util.regex.Pattern.compile("http://[a-z]+\\.iyaxin\\.com/archive\\.php[?]aid=[0-9]*"),
	};

	/**
	 * Manual smoke test: downloads one known article and prints the parsed result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		IyaxinNewsAnalyse parser=new IyaxinNewsAnalyse();
		String url="http://news.iyaxin.com/content/2016-11/08/content_10153746.htm";
		UrlMeta oneMeta=CrawlHTML.responseToURL(url);
		System.out.println(parser.parserHtml(oneMeta));
	}


	/**
	 * Tells whether the URL matches one of the known iyaxin.com article URL shapes.
	 *
	 * @param url absolute URL to test; {@code null} is treated as "not a detail page"
	 * @return {@code true} if the whole URL matches an article pattern and is not
	 *         under the photo sub-site; {@code false} otherwise
	 */
	public boolean isDetailPage(String url) {
		// Photo pages are excluded regardless of any pattern match.
		if (url == null || url.startsWith(PHOTO_HOST_PREFIX)) {
			return false;
		}
		for (java.util.regex.Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}


	/**
	 * Extracts a {@link NewsMeta} from a downloaded page.
	 *
	 * <p>Selectors are chosen by the URL's sub-site prefix (auto, edu, house,
	 * money, or the default layout). When a layout yields no title/content,
	 * progressively older layouts are tried, followed by generic fallbacks.
	 *
	 * @param urlMeta the crawled page; its URL and HTML are read
	 * @return the populated news record, or {@code null} when {@code urlMeta},
	 *         its URL or its HTML is missing and nothing can be parsed
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {

		if (urlMeta == null || urlMeta.getUrl() == null || urlMeta.getHtml() == null) {
			// Jsoup cannot parse a null document; report instead of crashing.
			System.err.println("IyaxinNewsAnalyse: missing url or html, nothing to parse");
			return null;
		}

		String url=urlMeta.getUrl();
		// NOTE(review): the original comment here said non-detail pages should
		// return null, but the code has only ever logged and continued. That
		// behavior is kept so pages currently being parsed are not dropped —
		// confirm whether an early return is actually wanted.
		if (!isDetailPage(url)) {
			System.err.println("不符合规则");
		}

		String html=urlMeta.getHtml();

		// Fields of the news entity filled in below.
		String title=null;
		String content=null;
		Long dateLong=null;
		// Comment and click counts are not extracted for this site; they stay
		// null and are handed to NumberUtils.parseInt as-is.
		String commentNum=null;
		String clickNum=null;

		Document doc=Jsoup.parse(html);
		if (url.startsWith("http://auto.iyaxin.com/")) {
			title=doc.select("h3.zxfztitle ").text(); // title
			dateLong=DateUtils.matchDate(doc.select("div.zxinfo").text()); // date
			content=HtmlCleaner.getContentHtml(url,doc.select("div.zxfzcontent")); // body
			if (StringUtils.isNullOrEmpty(title) && StringUtils.isNullOrEmpty(content)) {
				// Alternative layout shared with the default branch.
				title=doc.select("div.zhuti>h1").text();
				dateLong=DateUtils.matchDate(doc.select("div.z_left").text());
				content=HtmlCleaner.getContentHtml(url,doc.select("div.wenzhang"));
			}
		} else if (url.startsWith("http://edu.iyaxin.com/")) {
			title=doc.select("div#newsnr>div.title").text(); // title
			// Try the date embedded in the URL first, then the page header.
			dateLong=DateUtils.matchDate(url);
			if (dateLong == null || dateLong == 0) {
				dateLong=DateUtils.matchDate(doc.select("div.articleExplain>span").text());
			}
			content=HtmlCleaner.getContentHtml(url,doc.select("div.articleContent")); // body
		} else if (url.startsWith("http://house.iyaxin.com")) {
			title=doc.select("div#title").text(); // title
			dateLong=DateUtils.matchDate(doc.select("div#riqi").text()); // date
			content=HtmlCleaner.getContentHtml(url,doc.select("div#zoom")); // body
			if (StringUtils.isNullOrEmpty(title) && StringUtils.isNullOrEmpty(content)) {
				title=doc.select("div.appraise").text();
				dateLong=DateUtils.matchDate(doc.select("div.line").text());
				content=HtmlCleaner.getContentHtml(url,doc.select("span#comp-paste-div-4916"));
			}
		} else if (url.startsWith("http://money.iyaxin.com")) {
			title=doc.select("h1").text(); // title
			dateLong=DateUtils.matchDate(doc.select("span").text()); // date
			content=HtmlCleaner.getContentHtml(url,doc.select("div.content")); // body
		} else {
			title=doc.select("div.zhuti>h1").text(); // title
			dateLong=DateUtils.matchDate(doc.select("div.z_left").text()); // date
			content=HtmlCleaner.getContentHtml(url,doc.select("div#text")); // body

			// First fallback only when BOTH title and content are empty.
			if (StringUtils.isNullOrEmpty(title) && StringUtils.isNullOrEmpty(content)) {
				title=doc.select("div#newsnr").text();
				dateLong=DateUtils.matchDate(doc.select("div#nav_tittle1").text());
				content=HtmlCleaner.getContentHtml(url,doc.select("div#paassages"));
			}

			// Later fallbacks trigger when EITHER title or content is still empty.
			if (StringUtils.isNullOrEmpty(title) || StringUtils.isNullOrEmpty(content)) {
				title=doc.select("div#newsnr>h1").text();
				dateLong=DateUtils.matchDate(doc.select("div#newsnr>p").text());
				content=HtmlCleaner.getContentHtml(url,doc.select("div.setoplaysdx"));
			}

			if (StringUtils.isNullOrEmpty(title) || StringUtils.isNullOrEmpty(content)) {
				title=doc.select("div.con_biaoti").text();
				dateLong=DateUtils.matchDate(doc.select("div#nav_tittle1").text());
				content=HtmlCleaner.getContentHtml(url,doc.select("div#newsnr"));
			}
		}

		// Generic title fallbacks, applied to every sub-site.
		if (StringUtils.isNullOrEmpty(title)){
			title=doc.select("td.td01 font[style=font-size:18px; font-weight:bold;]").text();
		}
		if (StringUtils.isNullOrEmpty(title)){
			title=doc.select("table tbody tr td table tbody tr td font").text();
		}
		if (StringUtils.isNullOrEmpty(title)){
			title=doc.select("h1").text();
		}

		// Generic content fallbacks, applied to every sub-site.
		if (StringUtils.isNullOrEmpty(content)){
			content=HtmlCleaner.getContentHtml(url,doc.select("td#gqtjinfo"));
		}
		if (StringUtils.isNullOrEmpty(content)){
			content=HtmlCleaner.getContentHtml(url,doc.select(".article-detail"));
		}
		if (StringUtils.isNullOrEmpty(content)){
			content=HtmlCleaner.getContentHtml(url,doc.select("td.td01 font[style=font-size:14px; line-height:21px;]"));
		}

		// Last resort: take the date from the URL. A value of 0 is treated like
		// null, consistent with the edu branch above (presumably matchDate can
		// report "no date" either way — verify against DateUtils).
		if (dateLong == null || dateLong == 0) {
			dateLong=DateUtils.matchDate(url);
		}

		// "来源：" is the on-page label preceding the article's source.
		String author=JSoupUtils.matchAuthor(doc, "来源：");
		NewsMeta meta=new NewsMeta();
		meta.setTitle(title);
		meta.setUrl(url);
		meta.setType(1);
		meta.setClickNum(NumberUtils.parseInt(clickNum));
		meta.setCommentNum(NumberUtils.parseInt(commentNum));
		meta.setContent(content);
		meta.setDate(dateLong);
		meta.setAuthor(author);
		return meta;

	}


	/**
	 * Update hook; not implemented for this site.
	 *
	 * @param meta the previously parsed record (ignored)
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}


	/**
	 * Reports whether parsed records need a later refresh.
	 *
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate(){
		return false;
	}
}
